Introduction

This vignette demonstrates how to identify malignant cells and perform anchor-based cell type classification using the CosMx SMI NSCLC FFPE Dataset, Lung 5-1 sample. We employ the FindMalignantCells function to detect malignant cells based on predefined marker genes and subsequently apply RunABCT for cell type annotation using an anchor-based classification approach. This workflow showcases the integration of spatial transcriptomics data with computational methods for accurate cell type identification.

Loading Required Libraries

Ensure that all necessary libraries are installed and loaded.

library(Seurat)
library(SeuratWrappers)
library(Banksy)
library(harmony)
library(dplyr)
library(scales)
library(EnvStats)
library(stringr)
library(matrixStats)
library(InSituType)
library(UCell)
library(ggplot2)
library(MASS)
library(pracma)

source("./ABCT.r")

Loading and Preparing Data

Load the data for analysis. Here, we demonstrate using a Seurat object. The processed data can be downloaded from https://kbds.re.kr/hissta/datasetinfo?sampleIdx=19.

obj <- readRDS(paste0("/path/to/your/data/", "obj_final.rds"))

Define Cell Type List and Marker Genes

We define a list of cell types and specify marker genes for the analysis.

celltype_list <- c("Malignant", "Epithelial", "CD4T", "CD8T", "NK", "B", "Plasma",
                   "Macrophage", "Monocyte", "DC", "Mast", "Neutrophil", 
                   "Endothelial", "Fibroblast", "Unknown")

names(celltype_list) <- c(
  "#bf4040", "#FB8072", "#8DD3C7", "#FFFFB3", "#BEBADA", "#3288bd", 
  "#FDB462", "#B3DE69", "#FCCDE5", "#84cdee", "#BC80BD", "#CCEBC5", 
  "#FFED6F", "#1b9e77", "gray"
)

malignant_marker_list <- data.frame(
  cluster = "Malignant",
  gene = c("SOX9", "FGFR1", "KRAS", "MYC", "EGFR")
)

ABCT_marker_list <- read.csv("/path/to/marker_list.csv")
ABCT_marker_list$cluster <- factor(ABCT_marker_list$cluster, levels = celltype_list)
ABCT_marker_list <- ABCT_marker_list %>% arrange(cluster)

Identifying Malignant Cells

Use the FindMalignantCells function to identify malignant cells in the data.

malignant_path <- "/path/to/output/malignant/"

obj <- FindMalignantCells(
  obj,
  assay = "SCT",
  ctrl_assay = "negprobes",
  marker_list = malignant_marker_list,
  use_spatial = TRUE,
  M = 1,
  lambda = 0.2,
  area = NULL,
  w_neg = 0,
  dimx = "x_global_px",
  dimy = "y_global_px",
  smooth_reduction = "spatial_pca",
  path = malignant_path
)
#> Malignant: 
#> SOX9, FGFR1, KRAS, MYC, EGFR, SOX9.m0, FGFR1.m0, KRAS.m0, MYC.m0, EGFR.m0
#> Smoothing module score (UCell)
#> npcs: 30
#> Saving 10 x 5 in image
#> Coordinate system already present. Adding new coordinate system, which will replace
#> the existing one.

Performing ABCT Classification

Subsequently, classify the non-malignant cells using the RunABCT function. By default, if a BANKSY object already exists in the Seurat object, RunABCT will use the existing BANKSY object rather than running it again. If you wish to rerun BANKSY on a subsetted object, you should remove the existing BANKSY assay before executing the function.

subobj <- subset(obj, subset = malignant_result != "Malignant")
#> Warning: Not validating FOV objects
#> Warning: Not validating Centroids objects
#> Not validating Centroids objects
#> Warning: Not validating FOV objects
#> Not validating FOV objects
#> Not validating FOV objects
#> Warning: Not validating Seurat objects

abct_path <- "/path/to/output/abct/"

subobj <- RunABCT(
  subobj,
  assay = "SCT",
  ctrl_assay = "negprobes",
  marker_list = ABCT_marker_list,
  color_list = names(celltype_list[celltype_list %in% ABCT_marker_list$cluster]),
  method = "quantile",
  use_spatial = TRUE,
  M = 1,
  lambda = 0.2,
  dimx = "x_global_px",
  dimy = "y_global_px",
  smooth_reduction = "spatial_pca",
  path = abct_path
)
#> Calculating UCell module score
#> Epithelial: 
#> KRT7, KRT8, KRT19, EPCAM, PIGR, KRT7.m0, KRT8.m0, KRT19.m0, EPCAM.m0, PIGR.m0
#> CD4T: 
#> CD2, CD3D, CD3E, CD3G, CD4, LTB, IL7R, CD2.m0, CD3D.m0, CD3E.m0, CD3G.m0, CD4.m0, LTB.m0, IL7R.m0
#> CD8T: 
#> CD2, CD3D, CD3E, CD3G, CD8A, CD8B, CCL5, CD2.m0, CD3D.m0, CD3E.m0, CD3G.m0, CD8A.m0, CD8B.m0, CCL5.m0
#> NK: 
#> NKG7, GNLY, KLRB1, PRF1, NKG7.m0, GNLY.m0, KLRB1.m0, PRF1.m0
#> B: 
#> MS4A1, CD79A, CD19, CD37, CD38, MS4A1.m0, CD79A.m0, CD19.m0, CD37.m0, CD38.m0
#> Plasma: 
#> MZB1, JCHAIN, TNFRSF17, IGHG1, MZB1.m0, JCHAIN.m0, TNFRSF17.m0, IGHG1.m0
#> Macrophage: 
#> CD68, C1QA, C1QC, C1QB, CD68.m0, C1QA.m0, C1QC.m0, C1QB.m0
#> Monocyte: 
#> S100A8, S100A9, LYZ, CD14, VCAN, S100A8.m0, S100A9.m0, LYZ.m0, CD14.m0, VCAN.m0
#> DC: 
#> HLA-DQA1, HLA-DQB1, HLA-DPB1, HLA-DPA1, CLEC10A, GPR183, ITGAX, CD86, HLA-DQA1.m0, HLA-DQB1.m0, HLA-DPB1.m0, HLA-DPA1.m0, CLEC10A.m0, GPR183.m0, ITGAX.m0, CD86.m0
#> Mast: 
#> CPA3, TPSB2, TPSAB1, KIT, CPA3.m0, TPSB2.m0, TPSAB1.m0, KIT.m0
#> Neutrophil: 
#> CSF3R, CXCR2, CXCL8, CSF3R.m0, CXCR2.m0, CXCL8.m0
#> Endothelial: 
#> PECAM1, VWF, PECAM1.m0, VWF.m0
#> Fibroblast: 
#> DCN, COL1A1, COL1A2, LUM, PDGFRA, DCN.m0, COL1A1.m0, COL1A2.m0, LUM.m0, PDGFRA.m0
#> ** CellType max score result **
#> 
#>  Epithelial        CD4T        CD8T          NK           B      Plasma 
#>        5059        5133         295          54        4568        5140 
#>  Macrophage    Monocyte          DC        Mast  Neutrophil Endothelial 
#>       19494        1664        2709        2369        3440        6295 
#>  Fibroblast        <NA> 
#>       23363           0
#> * Epithelial anchor
#> anchor
#> Epithelial     filter 
#>       1012      78571
#> * CD4T anchor
#> anchor
#>   CD4T filter 
#>   1027  78556
#> * CD8T anchor
#> anchor
#>   CD8T filter 
#>     59  79524
#> * NK anchor
#> anchor
#> filter     NK 
#>  79572     11
#> * B anchor
#> anchor
#>      B filter 
#>    914  78669
#> * Plasma anchor
#> anchor
#> filter Plasma 
#>  78555   1028
#> * Macrophage anchor
#> anchor
#>     filter Macrophage 
#>      75684       3899
#> * Monocyte anchor
#> anchor
#>   filter Monocyte 
#>    79250      333
#> * DC anchor
#> anchor
#>     DC filter 
#>    542  79041
#> * Mast anchor
#> anchor
#> filter   Mast 
#>  79109    474
#> * Neutrophil anchor
#> anchor
#>     filter Neutrophil 
#>      78895        688
#> * Endothelial anchor
#> anchor
#> Endothelial      filter 
#>        1259       78324
#> * Fibroblast anchor
#> anchor
#> Fibroblast     filter 
#>       4673      74910
#> Final anchor
#> merged_anchor
#>           B        CD4T        CD8T          DC Endothelial  Epithelial 
#>         914        1027          59         542        1259        1012 
#>  Fibroblast      filter  Macrophage        Mast    Monocyte  Neutrophil 
#>        4673       63664        3899         474         333         688 
#>          NK      Plasma        <NA> 
#>          11        1028           0
#> Running Insitutype
#> Probability cutoff: 0.99
#> Anchor result (clust)
#> 
#>           B        CD4T        CD8T          DC Endothelial  Epithelial 
#>        6864       10296        1640        4142        6351        4522 
#>  Fibroblast  Macrophage        Mast    Monocyte  Neutrophil          NK 
#>       15761        8272        2401        3422        8837         252 
#>      Plasma 
#>        6823
#> Anchor result (cut)
#> 
#>           B        CD4T        CD8T          DC Endothelial  Epithelial 
#>        6313        9043        1252        3433        6028        4240 
#>  Fibroblast  Macrophage        Mast    Monocyte  Neutrophil          NK 
#>       15143        7753        2279        2607        8306         182 
#>      Plasma     Unknown 
#>        6597        6407
#> Saving annotation result as dataframe (csv)
#> Saving annotation result as rds object
#> Coordinate system already present. Adding new coordinate system, which will replace
#> the existing one.
#> Coordinate system already present. Adding new coordinate system, which will replace
#> the existing one.

Updating Metadata

Finally, update the metadata in the original object with the ABCT classification results.

obj <- update_metadata(obj, subobj, celltype_list)



Session Info
#> R version 4.3.2 (2023-10-31)
#> Platform: x86_64-conda-linux-gnu (64-bit)
#> Running under: Rocky Linux 8.10 (Green Obsidian)
#> 
#> Matrix products: default
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=ko_KR.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=ko_KR.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=ko_KR.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=ko_KR.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Asia/Seoul
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] pracma_2.4.4         MASS_7.3-60          ggplot2_3.5.1       
#>  [4] UCell_2.6.2          InSituType_2.0       matrixStats_1.3.0   
#>  [7] stringr_1.5.1        EnvStats_2.8.1       scales_1.3.0        
#> [10] dplyr_1.1.4          harmony_1.2.0        Rcpp_1.0.12         
#> [13] Banksy_0.99.13       SeuratWrappers_0.3.5 Seurat_5.0.1        
#> [16] SeuratObject_5.0.1   sp_2.1-4            
#> 
#> loaded via a namespace (and not attached):
#>   [1] RcppHungarian_0.3           RcppAnnoy_0.0.22           
#>   [3] splines_4.3.2               later_1.3.2                
#>   [5] bitops_1.0-7                tibble_3.2.1               
#>   [7] R.oo_1.26.0                 polyclip_1.10-6            
#>   [9] fastDummies_1.7.3           lifecycle_1.0.4            
#>  [11] aricode_1.0.3               globals_0.16.3             
#>  [13] lattice_0.22-6              SnowballC_0.7.1            
#>  [15] magrittr_2.0.3              plotly_4.10.4.9000         
#>  [17] sass_0.4.9                  rmarkdown_2.27             
#>  [19] jquerylib_0.1.4             yaml_2.3.8                 
#>  [21] remotes_2.5.0               httpuv_1.6.15              
#>  [23] sctransform_0.4.1           askpass_1.2.0              
#>  [25] spam_2.10-0                 spatstat.sparse_3.0-3      
#>  [27] reticulate_1.37.0           cowplot_1.1.3              
#>  [29] pbapply_1.7-2               RColorBrewer_1.1-3         
#>  [31] abind_1.4-5                 zlibbioc_1.48.0            
#>  [33] Rtsne_0.17                  GenomicRanges_1.54.1       
#>  [35] purrr_1.0.2                 R.utils_2.12.3             
#>  [37] BiocGenerics_0.48.1         RCurl_1.98-1.14            
#>  [39] GenomeInfoDbData_1.2.11     IRanges_2.36.0             
#>  [41] S4Vectors_0.40.2            ggrepel_0.9.5              
#>  [43] irlba_2.3.5.1               listenv_0.9.1              
#>  [45] spatstat.utils_3.1-0        umap_0.2.10.0              
#>  [47] goftest_1.2-3               RSpectra_0.16-1            
#>  [49] spatstat.random_3.2-3       fitdistrplus_1.1-11        
#>  [51] parallelly_1.37.1           leiden_0.4.3.1             
#>  [53] codetools_0.2-20            DelayedArray_0.28.0        
#>  [55] tidyselect_1.2.1            farver_2.1.2               
#>  [57] stats4_4.3.2                spatstat.explore_3.2-7     
#>  [59] jsonlite_1.8.8              BiocNeighbors_1.20.0       
#>  [61] progressr_0.14.0            ggridges_0.5.6             
#>  [63] survival_3.6-4              systemfonts_1.1.0          
#>  [65] dbscan_1.2-0                tools_4.3.2                
#>  [67] ragg_1.3.2                  ica_1.0-3                  
#>  [69] glue_1.7.0                  gridExtra_2.3              
#>  [71] SparseArray_1.2.2           xfun_0.44                  
#>  [73] MatrixGenerics_1.14.0       GenomeInfoDb_1.38.1        
#>  [75] withr_3.0.0                 BiocManager_1.30.23        
#>  [77] fastmap_1.2.0               fansi_1.0.6                
#>  [79] openssl_2.2.0               digest_0.6.35              
#>  [81] rsvd_1.0.5                  R6_2.5.1                   
#>  [83] mime_0.12                   textshaping_0.3.7          
#>  [85] colorspace_2.1-0            scattermore_1.2            
#>  [87] sccore_1.0.5                tensor_1.5                 
#>  [89] spatstat.data_3.0-4         R.methodsS3_1.8.2          
#>  [91] utf8_1.2.4                  tidyr_1.3.1                
#>  [93] generics_0.1.3              data.table_1.15.4          
#>  [95] httr_1.4.7                  htmlwidgets_1.6.4          
#>  [97] S4Arrays_1.2.0              uwot_0.2.2                 
#>  [99] pkgconfig_2.0.3             gtable_0.3.5               
#> [101] lmtest_0.9-40               SingleCellExperiment_1.24.0
#> [103] XVector_0.42.0              htmltools_0.5.8.1          
#> [105] dotCall64_1.1-1             Biobase_2.62.0             
#> [107] png_0.1-8                   SpatialExperiment_1.12.0   
#> [109] knitr_1.46                  reshape2_1.4.4             
#> [111] rjson_0.2.21                nlme_3.1-164               
#> [113] cachem_1.1.0                zoo_1.8-12                 
#> [115] KernSmooth_2.23-24          parallel_4.3.2             
#> [117] miniUI_0.1.1.1              pillar_1.9.0               
#> [119] grid_4.3.2                  vctrs_0.6.5                
#> [121] RANN_2.6.1                  lsa_0.73.3                 
#> [123] promises_1.3.0              xtable_1.8-4               
#> [125] cluster_2.1.6               evaluate_0.23              
#> [127] magick_2.8.3                cli_3.6.2                  
#> [129] compiler_4.3.2              rlang_1.1.3                
#> [131] crayon_1.5.2                future.apply_1.11.2        
#> [133] labeling_0.4.3              mclust_6.1.1               
#> [135] plyr_1.8.9                  stringi_1.8.4              
#> [137] BiocParallel_1.36.0         viridisLite_0.4.2          
#> [139] deldir_2.0-4                munsell_0.5.1              
#> [141] lazyeval_0.2.2              spatstat.geom_3.2-9        
#> [143] Matrix_1.6-5                RcppHNSW_0.6.0             
#> [145] patchwork_1.2.0             future_1.33.2              
#> [147] shiny_1.8.1.1               highr_0.10                 
#> [149] SummarizedExperiment_1.32.0 ROCR_1.0-11                
#> [151] leidenAlg_1.1.3             igraph_2.0.3               
#> [153] bslib_0.7.0